[AMDGPU] llvm.amdgcn.raw.buffer.load.format intrinsic supports v4i32 as return type. #116067
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-llvm-ir

Author: None (sstipano)

Changes: …4i32 as return type. This went unnoticed until CreateIntrinsic with implicit mangling was used.

Full diff: https://github.com/llvm/llvm-project/pull/116067.diff

3 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index d6375ab77cfb32..492868b8bd099d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1162,7 +1162,7 @@ class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsi
// all: volatile op (bit 31, stripped at lowering)
[IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<0>;
-def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
+def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad;
def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
class AMDGPURawAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
index 23efaa4d2bd91e..baeb5909f04e8d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
@@ -169,6 +169,62 @@ define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voff
ret <4 x float> %val
}
+define amdgpu_ps <4 x i32> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_v4i32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+ ; GFX8-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_v4i32
+ ; GFX8: bb.1 (%ir-block.0):
+ ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX8-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0
+ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1
+ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2
+ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub3
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX8-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX8-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
+ ;
+ ; GFX12-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_v4i32
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub0
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub1
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub2
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub3
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX12-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
+ %val = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ ret <4 x i32> %val
+}
+
; Waterfall for rsrc and soffset, copy for voffset
define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) {
; GFX8-LABEL: name: raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset
@@ -325,9 +381,68 @@ define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voff
ret <4 x float> %val
}
+define amdgpu_ps <4 x i32> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095_v4i32(<4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) {
+ ; GFX8-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095_v4i32
+ ; GFX8: bb.1 (%ir-block.0):
+ ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX8-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0
+ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1
+ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2
+ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub3
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX8-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX8-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
+ ;
+ ; GFX12-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095_v4i32
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub0
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub1
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub2
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub3
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX12-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
+ %voffset = add i32 %voffset.base, 4095
+ %val = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ ret <4 x i32> %val
+}
+
+
declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32 immarg) #0
declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32 immarg) #0
declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32 immarg) #0
declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32 immarg) #0
+declare <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32>, i32, i32, i32 immarg) #0
attributes #0 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll
index 60c6268e448cb2..29efdddac39b28 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll
@@ -17,6 +17,25 @@ main_body:
ret {<4 x float>, <4 x float>, <4 x float>} %r2
}
+;CHECK-LABEL: {{^}}buffer_load_v4i32:
+;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0
+;CHECK: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc
+;CHECK: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc
+;CHECK: s_waitcnt
+define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_v4i32(<4 x i32> inreg) {
+main_body:
+ %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0)
+ %data_glc = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 0, i32 0, i32 1)
+ %data_slc = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 0, i32 0, i32 2)
+ %fdata = bitcast <4 x i32> %data to <4 x float>
+ %fdata_glc = bitcast <4 x i32> %data_glc to <4 x float>
+ %fdata_slc = bitcast <4 x i32> %data_slc to <4 x float>
+ %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %fdata, 0
+ %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %fdata_glc, 1
+ %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %fdata_slc, 2
+ ret {<4 x float>, <4 x float>, <4 x float>} %r2
+}
+
;CHECK-LABEL: {{^}}buffer_load_immoffs:
;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42
;CHECK: s_waitcnt
@@ -26,6 +45,16 @@ main_body:
ret <4 x float> %data
}
+;CHECK-LABEL: {{^}}buffer_load_immoffs_v4i32:
+;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs_v4i32(<4 x i32> inreg) {
+main_body:
+ %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 42, i32 0, i32 0)
+ %fdata = bitcast <4 x i32> %data to <4 x float>
+ ret <4 x float> %fdata
+}
+
;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 60 offset:4092
;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc
@@ -43,6 +72,26 @@ main_body:
ret <4 x float> %data
}
+;CHECK-LABEL: {{^}}buffer_load_immoffs_large_v4i32:
+;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 60 offset:4092
+;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc
+;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4092
+;CHECK-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc
+;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:4
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs_large_v4i32(<4 x i32> inreg) {
+main_body:
+ %d.0 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 4092, i32 60, i32 0)
+ %d.1 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 4092, i32 32764, i32 0)
+ %d.2 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 4, i32 36860, i32 0)
+ %fd.0 = bitcast <4 x i32> %d.0 to <4 x float>
+ %fd.1 = bitcast <4 x i32> %d.1 to <4 x float>
+ %fd.2 = bitcast <4 x i32> %d.2 to <4 x float>
+ %d.3 = fadd <4 x float> %fd.0, %fd.1
+ %data = fadd <4 x float> %fd.2, %d.3
+ ret <4 x float> %data
+}
+
;CHECK-LABEL: {{^}}buffer_load_ofs:
;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen
;CHECK: s_waitcnt
@@ -52,6 +101,16 @@ main_body:
ret <4 x float> %data
}
+;CHECK-LABEL: {{^}}buffer_load_ofs_v4i32:
+;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs_v4i32(<4 x i32> inreg, i32) {
+main_body:
+ %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 %1, i32 0, i32 0)
+ %fdata = bitcast <4 x i32> %data to <4 x float>
+ ret <4 x float> %fdata
+}
+
;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60
;CHECK: s_waitcnt
@@ -62,6 +121,17 @@ main_body:
ret <4 x float> %data
}
+;CHECK-LABEL: {{^}}buffer_load_ofs_imm_v4i32:
+;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs_imm_v4i32(<4 x i32> inreg, i32) {
+main_body:
+ %ofs = add i32 %1, 60
+ %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 %ofs, i32 0, i32 0)
+ %fdata = bitcast <4 x i32> %data to <4 x float>
+ ret <4 x float> %fdata
+}
+
;CHECK-LABEL: {{^}}buffer_load_x:
;CHECK: buffer_load_format_x v0, off, s[0:3], 0
;CHECK: s_waitcnt
@@ -83,5 +153,6 @@ main_body:
declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32) #0
declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32) #0
declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #0
+declare <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32>, i32, i32, i32) #0
attributes #0 = { nounwind readonly }
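
For context only (not part of the PR): a minimal C++ sketch of the kind of call that exposes the old constraint. The return-type-based IRBuilder::CreateIntrinsic overload derives the mangling suffix implicitly from the requested return type, and with llvm_anyfloat_ty a <4 x i32> result did not match the intrinsic signature; with llvm_any_ty it mangles to @llvm.amdgcn.raw.buffer.load.format.v4i32. The helper name and the Rsrc/VOffset/SOffset values are assumptions for illustration.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

using namespace llvm;

// Hypothetical helper: emit a raw.buffer.load.format that returns <4 x i32>.
// The return type passed to CreateIntrinsic drives the implicit mangling,
// so no explicit overload type list (and no bitcast from <4 x float>) is needed.
static Value *emitRawBufferLoadFormatV4I32(IRBuilder<> &B, Value *Rsrc,
                                           Value *VOffset, Value *SOffset) {
  Type *V4I32 = FixedVectorType::get(B.getInt32Ty(), 4);
  Value *Aux = B.getInt32(0); // cachepolicy immarg (bit 0 = glc, bit 1 = slc)
  return B.CreateIntrinsic(V4I32, Intrinsic::amdgcn_raw_buffer_load_format,
                           {Rsrc, VOffset, SOffset, Aux});
}

With the TableGen definition relaxed to llvm_any_ty, the same overload should accept both the float and the integer result variants directly.
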
sstipano force-pushed from 7d6bf02 to e10852f
jayfoad left a comment:
LGTM.
But please edit the commit message to follow usual git style: the first line should be shorter and start with an "AMDGPU" tag.
sstipano force-pushed from e10852f to 5ffd708
And I don't think LLPC is anything meaningful to the LLVM community.
Don't state what it can do; state what the intrinsic supports.
Well it's an open source frontend that generates LLVM IR: https://github.com/GPUOpen-Drivers/llpc |
sstipano force-pushed from 5ffd708 to a2a2d96
…as return type. This went unnoticed until CreateIntrinsic with implicit mangling was used.
…4i32 as return type. This went unnoticed until CreateIntrinsic with implicit mangling was used.